Within the training dataset each variable was permuted 10 times. These permuted datasets were then used to fit a logistic model bagged 100 times. The bagged model was used to predict on the entire (unbalanced) training dataset. The AUC for each permuted dataset of a given variable was then averaged. This mean AUC was subtracted from the base model AUC (delta AUC). The delta AUC was rescaled by max(delta AUC) to yield variable importance. This whole process took roughly 4 hours using 20 cores of the high memory machine.
# Load precomputed permutation-importance results for the full model
# (list of 3: relative importance, mean/sd AUC table, raw per-permutation AUCs
# — see permOneVar's return value below)
PermFullModel <- readRDS("../data_out/TempSplit/PermFullModel.rds")
Population size is the most important variable.
#model: full covariate set used on the temporal-split training data
glm.formula <- as.formula("case~
NDVI+NDVIScale+
popLog10+
RFsqrt+RFScale+
tempMean+tempScale+
fireDenSqrt+fireDenScale+
spRich+primProp")
#predictor names: split the formula RHS on the literal "+" and strip whitespace
variables <- trimws(unlist(strsplit(as.character(glm.formula)[3], "+", fixed = TRUE)))
#labels for the permutation schemes: baseline, each single variable, all at once
variablesName <- c("full model", variables, "all permutated")
#Plot relative Importance (element 1 of the saved results)
RelImp <- PermFullModel[[1]]
names(RelImp) <- variables
barplot(RelImp)
#rank relative importance (ascending)
sort(RelImp)
## fireDenSqrt NDVI fireDenScale RFScale tempScale
## 0.03680823 0.08309142 0.13261168 0.17530674 0.18917706
## primProp RFsqrt tempMean NDVIScale spRich
## 0.19504424 0.23542957 0.23828224 0.30154515 0.54232322
## popLog10
## 1.00000000
All variables were permuted simultaneously to get at the baseline AUC of the model approach. It is roughly 0.5 but has wide variability.
# Per-scheme AUC summary (Model, meanAUC, sdAUC) from the permutation results.
AUC <- PermFullModel[[2]]
# Columns were stored via cbind of character labels, so they come back as
# factor/character; coerce through character to avoid factor-level integer coding.
AUC$meanAUC <- as.numeric(as.character(AUC$meanAUC))
AUC$sdAUC <- as.numeric(as.character(AUC$sdAUC))
# Rank the permutation schemes from least to most AUC loss
AUC[order(AUC$meanAUC, decreasing = TRUE),]
## Model meanAUC sdAUC
## 1 full model 0.8471939 0.002230535
## 9 fireDenSqrt 0.8454479 0.002532170
## 2 NDVI 0.8432525 0.004200308
## 10 fireDenScale 0.8409035 0.003300597
## 6 RFScale 0.8388782 0.005135133
## 8 tempScale 0.8382203 0.003520869
## 12 primProp 0.8379419 0.004117191
## 5 RFsqrt 0.8360262 0.005619717
## 7 tempMean 0.8358909 0.004074073
## 3 NDVIScale 0.8328900 0.004739831
## 11 spRich 0.8214686 0.014090251
## 4 popLog10 0.7997585 0.016359660
## 13 all permutated 0.5181521 0.115364210
library(gplots)
# Mean AUC +/- 1 sd for each permutation scheme
plotCI(x=seq_along(AUC$Model), y=AUC$meanAUC, uiw=AUC$sdAUC, gap=0, xaxt="n", xlab="Permuted Variable", ylab="Mean AUC (10 permutations)")
axis(1, at=seq_along(AUC$Model), labels=AUC$Model)
case: reporting of any YF cases (0,1)
primProp: sum of each municipality's relative area that is both agricultural and falls within a primate genus range. {0,9} Missing for 2014
# Bagging ----
bagging <- function(form.x.y, training, new.data, n.pres = 10, n.abs = 100){
  # Fit one bootstrap ("bag") logistic regression on a small fixed-size
  # subsample of the training data and predict onto new.data.
  # modified JP's bagging function 12/1/17 RK
  #
  # form.x.y : formula for the logistic model
  # training : data frame containing training data; must have a `case` column
  #            with 1 = presence, 0 = absence
  # new.data : data for the fitted model to predict onto
  # n.pres   : number of presence rows to subsample (default 10, as before)
  # n.abs    : number of absence rows to subsample (default 100, as before)
  #
  # Returns list(predictions, coefficients):
  #   [[1]] predicted probabilities (type = "response") for new.data
  #   [[2]] coefficient vector of this bag's model
  #1. Create subset of data with fixed number of pres and abs
  # (base subsetting — no dplyr needed, identical rows selected)
  training.pres <- training[training$case == 1, , drop = FALSE]
  training.abs  <- training[training$case == 0, , drop = FALSE]
  rows.p <- sample(nrow(training.pres), size = n.pres)
  rows.b <- sample(nrow(training.abs), size = n.abs)
  #2. Build logreg model with the subsample
  glm_fit <- glm(form.x.y,
                 data = rbind(training.pres[rows.p, ], training.abs[rows.b, ]),
                 family = binomial(logit))
  #3. Pull out model coefs
  glm.coef <- coef(glm_fit)
  #4. Use model to predict probabilities on new.data
  predictions <- predict(glm_fit, newdata = new.data, type = "response")
  list(predictions, glm.coef)
}
# Permute Variable based on loop iteration of PermOneVar ----
permutedata <- function(formula = glm.formula, trainingdata, i){
  # Return a copy of trainingdata with column(s) permuted according to i:
  #   i == 1                     -> no permutation (full/base model)
  #   i == length(variablesName) -> permute every predictor simultaneously
  #   otherwise                  -> permute the single predictor variablesName[i]
  #
  # formula      : model formula; its RHS terms name the predictor columns
  # trainingdata : data frame containing the predictor columns
  # i            : index into c("full model", <predictors>, "all permutated")
  #parse predictor names out of the formula RHS (split on literal "+")
  variables <- trimws(unlist(strsplit(as.character(formula)[3], "+", fixed = TRUE)))
  variablesName <- c("full model", variables, "all permutated")
  permuted.data <- trainingdata
  n <- nrow(permuted.data)
  if (i == 1) {
    # full model: leave the data untouched
  } else if (i == length(variablesName)) {
    # permute all predictors, independently, one column at a time
    for (vari in variables) {
      permuted.data[, vari] <- sample(permuted.data[, vari], n, replace = FALSE)
    }
  } else {
    # permute a single predictor
    permuted.data[, variablesName[i]] <- sample(permuted.data[, variablesName[i]], n, replace = FALSE)
  }
  permuted.data
}
permOneVar=function(formula = glm.formula, bag.fnc=bagging,permute.fnc=permutedata, traindata = training, cores=2, no.iterations= 100, perm=10){
# Permutation variable importance for a bagged logistic regression model.
# For each name in c("full model", <each RHS term>, "all permutated"):
#   permute the corresponding column(s) of traindata `perm` times; for each
#   permutation fit `no.iterations` bagged models in parallel (via bag.fnc on
#   the permuted data) and score the mean prediction on the ORIGINAL
#   (unpermuted) traindata by AUC.
#
# formula       : model formula; RHS terms name the predictor columns
# bag.fnc       : function(form.x.y, training, new.data) -> list(preds, coefs)
# permute.fnc   : function(formula, trainingdata, i) -> permuted data frame
# traindata     : training data with presence/absence column `case`
# cores         : number of cores to use for parallel; default to 2
# no.iterations : number of low bias models to make; default to 100
# perm          : number of permutations per variable; default to 10
#
# Returns list(rel.import, mean.auc, perm.auc):
#   rel.import : numeric vector of delta AUC per single predictor, rescaled by max
#   mean.auc   : data frame (Model, meanAUC, sdAUC) over all permutation schemes;
#                note cbind() of character labels makes all columns non-numeric
#   perm.auc   : perm x length(variablesName) matrix of raw AUCs
#
# NOTE(review): prediction() and performance() below are ROCR functions, but
# ROCR is never loaded here — presumably attached by the caller; confirm.
# NOTE(review): relative.import is built but never returned (rel.import is).
library(dplyr)
library(doParallel)
#parse out variables from formula object (RHS split on literal "+")
variables <- trimws(unlist(strsplit(as.character(formula)[3], "+", fixed = T)), which = "both")
variablesName <- c("full model", variables, "all permutated")
#make objects for outputs to be saved in
perm.auc <- matrix(NA, nrow=perm, ncol=length(variablesName)) #place to save AUC of models based on different permuation
for (j in 1:length(variablesName)){
print(c(j,variablesName[j])) #let us know where the simulation is at.
VarToPerm <- j
for (k in 1:perm){
#permute the data for scheme j (none / single variable / all variables)
permuted.data <- permute.fnc(formula=formula, trainingdata= traindata, i=VarToPerm) #permute variable of interest
print(c(j,k))
#Set up parallel cores to return 2 lists: 1) predictions, 2) coef of model ----
cores.to.use <- cores #change dep on computer
iterations <- no.iterations #number of low bias models to make
#create class to combine multiple results
#(re-defined on every pass of the inner loop; harmless but wasteful)
multiResultClass <- function(predictions = NULL,coefs = NULL)
{
me <- list(
predictions = predictions,
coefs = coefs
)
## Set the name for the class
class(me) <- append(class(me),"multiResultClass")
return(me)
}
cl <- makeCluster(cores.to.use)
registerDoParallel(cl)
#each parallel iteration fits one bag on the permuted data and predicts onto
#the original (unpermuted) traindata
trainModel <- foreach(i=1:iterations) %dopar% {
result <- multiResultClass()
output1 <- bag.fnc(form.x.y=formula,training= permuted.data ,new.data=traindata)
result$predictions <-output1[[1]]
#result$coefs <- output1[[2]]
return(result)
}
stopCluster(cl)
#aggregate data from clusters ----
#pull out data in usable fashion: one column of predictions per bag
trainingPreds <- do.call(cbind,(lapply(trainModel, '[[', 1)))
#trainingCoefs <- do.call(cbind,(lapply(trainModel, '[[', 2)))
#bagged prediction = row-wise mean over all bags
output.preds<- apply(trainingPreds, 1, mean)
preds <- prediction(output.preds, traindata$case) #other projects have used dismo::evaluate instead. Not sure if is makes a difference.
#matrix of AUC to return (AUC of bagged predictions vs true case labels)
perm.auc[k,j] <- unlist(performance(preds, "auc")@y.values)
}
}
#calculate relative importance
perm.auc.mean <- apply(perm.auc,2,mean)
perm.auc.sd <- apply(perm.auc, 2, sd)
delta.auc <- perm.auc.mean[1] - perm.auc.mean[-c(1, length(perm.auc.mean))] #change in AUC from base model only for single variable permutation
rel.import <- delta.auc/max(delta.auc) # normalized relative change in AUC from base model only for single variable permutation
#Output for relative importance
relative.import <- as.data.frame(cbind(Permutated=variables,RelImport=rel.import))
#plot it for fun
barplot(rel.import, names.arg = variables)
#Output for mean and sd of permutations for all permutations (non, single var, and all var)
mean.auc <- as.data.frame(cbind(Model=variablesName,meanAUC=perm.auc.mean, sdAUC=perm.auc.sd))
#Output of AUC for each permutation
colnames(perm.auc) <- variablesName
#return training coefs and AUC for each iteration
#return(list(train.auc, Coefs))
return(list(rel.import, mean.auc,perm.auc))
}
The methods for the workflow have not changed with the exception of increasing the number of bags from 100 to 1000. This analysis explores different variable combinations. Permutation 3, the final model, was permuted a total of 100 times (instead of the standard 10).
Drop One model exploration showed that population was the main explanatory variable. However, the models with population permuted (100 bags, 10 perm) did not have a large drop in AUC. For this reason, we should explore model permutations after dropping population.
#drop one covar importance: same covariate set as the full model, minus popLog10
glm.formula.popless <- as.formula("case~ NDVI+NDVIScale+
RFsqrt+RFScale+
tempMean+tempScale+
fireDenSqrt+fireDenScale+
spRich+primProp")
#predictor names parsed from the formula RHS
PoplessVariables <- trimws(unlist(strsplit(as.character(glm.formula.popless)[3], "+", fixed = TRUE)))
#increased number of bagged models to 1000 from 100
#PermPoplessModel <- permOneVar(formula = glm.formula.popless,traindata = training.data, cores=20, no.iterations = 1000, perm = 10 )
The stdev around the AUC for the fully permuted model is reassuring. The model performance is slightly impacted by removing population. The order of relative importance for the environmental covariates hasn’t changed.
## Model meanAUC sdAUC
## 12 all permutated 0.514 0.086
## 10 spRich 0.775 0.014
## 3 NDVIScale 0.776 0.012
## 4 RFsqrt 0.795 0.007
## 6 tempMean 0.798 0.004
## 11 primProp 0.800 0.006
## 7 tempScale 0.802 0.003
## 2 NDVI 0.803 0.003
## 9 fireDenScale 0.805 0.009
## 5 RFScale 0.806 0.002
## 8 fireDenSqrt 0.807 0.004
## 1 full model 0.811 0.001
Is the second environmental covariate really adding anything to the model? The more important of the two flavors of each covariate (based on Permutation 1) were used to build a low variable model. All fire covariates were removed as well.
#permute one covar importance, single flavor (SF) of each covariate
#(the more important flavor per Permutation 1; fire covariates dropped entirely)
glm.formula.SF <- as.formula("case~ NDVIScale+
RFsqrt+
tempMean+
spRich")
#predictor names parsed from the formula RHS
SingleVariables <- trimws(unlist(strsplit(as.character(glm.formula.SF)[3], "+", fixed = TRUE)))
#PermSFModel <- permOneVar(formula = glm.formula.SF,traindata = training.data, cores=20, no.iterations = 1000, perm = 10 ) #increased number of bagged models to 1000 from 100
Again, the model performance isn’t strongly impacted by removing variables. However, the all-permuted model has a slightly elevated mean AUC and larger error.
## Model meanAUC sdAUC
## 6 all permutated 0.547 0.132
## 5 spRich 0.744 0.029
## 3 RFsqrt 0.767 0.015
## 4 tempMean 0.777 0.011
## 2 NDVIScale 0.781 0.009
## 1 full model 0.801 0.000
Before running this model, we felt like this should be the final model. Because of that, along with the variation of the all-permuted model in Permutation 2, the number of permutations was increased from 10 to 100.
#permute one covar importance, single flavor (SF) of each covariate, plus population (SFP)
glm.formula.SFP <- as.formula("case~ popLog10+
NDVIScale+
RFsqrt+
tempMean+
spRich")
#predictor names parsed from the formula RHS
SFPVariables <- trimws(unlist(strsplit(as.character(glm.formula.SFP)[3], "+", fixed = TRUE)))
#PermSFPModel <- permOneVar(formula = glm.formula.SFP,traindata = training.data, cores=20, no.iterations = 1000, perm = 100 ) #increased number of bagged models to 1000 from 100
This code took about 3.5 days to run. The model AUC looks good. The variability around the all permutated model is still high. I wonder if this is expected in models with fewer variables?
## Model meanAUC sdAUC
## 7 all permutated 0.549 0.132
## 6 spRich 0.794 0.028
## 2 popLog10 0.813 0.018
## 4 RFsqrt 0.816 0.011
## 3 NDVIScale 0.830 0.007
## 5 tempMean 0.835 0.004
## 1 full model 0.848 0.000
Map of predictions and known cases by month (created via mappingPredictions.Rmd). This is only on the training data (hence the blank municipalities), and includes known true cases as green diamonds.
We want to explore how the predictions rank, noting where known positives fall out. This plot extends down to the lowest-ranked known positive.
# Rank-order plot of predictions on the training data, highlighting known positives.
predictions <- as.data.frame(readRDS("../data_out/Predictions/SFPPredPlot.rds"))
# lowest predicted value among known positives (only used if the trimming
# filter below is re-enabled)
minTrue <- min(predictions$prediction[predictions$case == 1])
predRank <- predictions %>%
#filter(prediction>minTrue) %>% #trim off predictions below the last true positive
arrange(desc(prediction))
# rank index: 1 = highest prediction
predRank$index <- seq_len(nrow(predRank))
# background points in gray, known positives overplotted in maroon
ggplot(data = predRank[predRank$case == 0,], aes(y = prediction, x = index)) +
geom_point(color = "gray80") +
geom_point(data = predRank[predRank$case == 1,], aes(y = prediction, x = index), color = "maroon") +
ylab("Prediction")+
xlab("Rank order")+
theme_bw() +
ggtitle("Rank Order of ALL Training Data")
# split out true positives vs background for the summaries/histograms below
truePos <- predictions[predictions$case==1,]
bgPreds <- predictions[predictions$case==0,]
The lowest prediction for a known case is 0.02. 622720 instances fall above this, out of a total of 622720. The mean value given to true positives is 0.317 (sd 0.267), compared to 0.073 (sd 0.107) for the background points.
Histograms of the predictions are as follows:
# Distribution of predicted probabilities for known cases vs background
hist(truePos$prediction, main = "Predictions on True Positives")
hist(bgPreds$prediction, main = "Predictions on Background")
John suggested we explore the possibility of two different point processes for yellow fever spillover in Brazil. We split the country into two regions: the northwest with high primate diversity (\(\geq 6\) of 22) and the remaining portion with low primate diversity. A threshold of 6 species produced the most contiguous region of high primate diversity. Previous iterations of this work used a temporally balanced split for training and testing data. This split was redone to create a balanced split in 3 dimensions (x space, y space and time). The newly space and time balanced data for the whole country (used for the one model approach) was then split into high and low non-human primate diversity regions (used for the two model approach).
Models consisted of 500 bags and 10 permutations unless otherwise stated. This bag/permutation scheme was based on computational time and memory.
## Reading layer `BRAZpolygons' from data source `/Users/renikaul/Documents/YF_Brazil/data_clean' using driver `ESRI Shapefile'
## Simple feature collection with 5561 features and 2 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -73.99442 ymin: -33.75206 xmax: -28.83588 ymax: 5.271807
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
Models were built using all of the covariates (full.formula) and just environmental covariates (popless.formula).
# Model formulas: full covariate set vs. environment-only (no population)
full.formula <- as.formula("case~ NDVI+NDVIScale+
popLog10+
RF+RFsqrt+RFScale+
tempMean+tempScale+
fireNum+fireDen+fireDenSqrt+fireDenScale+
spRich+primProp")
popless.formula <- as.formula("case~ NDVI+NDVIScale+
RFsqrt+RFScale+
tempMean+tempScale+
fireDenSqrt+fireDenScale+
spRich+primProp")
#load all the data: space-and-time balanced split, 500 bags
STPermFullModel <- readRDS("../data_out/SpaceAndTimeSplit/PermFullModel500.rds")
STPermPoplessModel <- readRDS("../data_out/SpaceAndTimeSplit/PermPoplessModel500.rds")
STFull<- RankImp(STPermFullModel)
STPopless<- RankImp(STPermPoplessModel)
STPermSFModel <- readRDS("../data_out/SpaceAndTimeSplit/PermSFModel500.rds")
#Temporal split
PermFullModel <- readRDS("../data_out/TempSplit/PermFullModel.rds")
PermPoplessModel <- readRDS("../data_out/TempSplit/PermPoplessModel.rds")
#NOTE(review): the next line recomputes STFull identically to the call above,
#and STPopless is recomputed two lines down — redundant unless RankImp
#(defined elsewhere) has side effects; confirm and drop one of each pair.
STFull<- RankImp(STPermFullModel)
Full <- RankImp(PermFullModel)
STPopless<- RankImp(STPermPoplessModel)
STSF <- RankImp(STPermSFModel)
#This is the important chunk of the code: join space-time full-model ranks
#with temporal-split full-model ranks by Model name for side-by-side comparison
VariRank <- STFull %>%
select(c(Model, STFull=Rank)) %>%
dplyr::left_join(Full %>% select(c(Model, TFull=Rank)), by="Model")